{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Finding Gun-Related News"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# set up globals\n",
    "import os\n",
    "import re\n",
    "import csv\n",
    "import math\n",
    "import functools\n",
    "import xml.etree.ElementTree\n",
    "from datetime import datetime\n",
    "from itertools import chain\n",
    "from operator import itemgetter\n",
    "\n",
    "import mysql.connector\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import spacy\n",
    "\n",
    "WHITESPACE = re.compile(r'\\s+')\n",
    "NLP = spacy.load('en', parser=False, tagger=False, entity=False)\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [
     1,
     7,
     17,
     21
    ],
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# functions for working with text\n",
    "def clean_abstract(abstract):\n",
    "    \"\"\"Remove HTML and unnecessary spaces from broadcast abstract.\"\"\"\n",
    "    # remove HTML markup\n",
    "    txt = ''.join(xml.etree.ElementTree.fromstring(abstract).itertext())\n",
    "    return WHITESPACE.sub(' ', txt).strip()\n",
    "\n",
    "def tokenize(txt, multi_words=None):\n",
    "    \"\"\"Convert text into a set of lowercase lemmas.\"\"\"\n",
    "    if multi_words is not None:\n",
    "        # these are multi-word phrases that get replaced with single tokens\n",
    "        for w in multi_words:\n",
    "            txt = re.sub(w, w.replace(' ', '_'), txt, flags=re.I)\n",
    "    return frozenset(\n",
    "        i.lemma_.lower().strip() for i in NLP(txt.lower()) if not (i.is_punct or i.is_space)\n",
    "    )\n",
    "\n",
    "def term_finder(tokens, terms):\n",
    "    \"\"\"Check whether terms appear among tokens.\"\"\"\n",
    "    return 1 if any(k in tokens for k in terms) else 0\n",
    "\n",
    "def match_regex(txt, regex):\n",
    "    return 1 if regex.search(txt) is not None else 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Identify News Stories\n",
    "\n",
    "Search the database for stories that use gun-related words. This group of stories becomes the initial dataset of candidate stories. These stories will then be further matched against gun-related terms to ensure the articles are truly about guns. For example, the initial search looks for \"gun\" and will match both \"gunfire\" and \"begun.\" The subsequent filtering of the stories will eliminate stories that contain \"begun\" but not \"gun\", \"gunfire\", or other gun-related words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [],
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# terms to use when searching full database for potential gun-related news\n",
    "gun_searches = (\n",
    "    'gun', 'shooter', 'shooting', 'sniper', 'firearm', 'assault weapon', 'rifle'\n",
    ")\n",
    "\n",
    "# lemmas to use when verifying news stories are about guns\n",
    "gun_keywords = (\n",
    "    'gun', 'gunfire', 'gunman', 'gunmen', 'gunmaker', 'gunpowder',\n",
    "    'gunrunner', 'gunrunning', 'gunshop', 'gunshot', 'handgun', 'shotgun',\n",
    "    'airgun', 'popgun', 'shoot', 'shooter', 'shooting', 'shootings', 'sharpshooter',\n",
    "    'sniper', 'firearm', 'assault weapon', 'rifle', 'rifleman', 'riflemen'\n",
    ")\n",
    "\n",
    "# get lemmas for each search term\n",
    "gun_lemmas = []\n",
    "for kw in gun_keywords:\n",
    "    tok = NLP(kw.replace(' ', '_'))[0].lemma_.lower().strip()\n",
    "    if tok not in gun_lemmas:\n",
    "        gun_lemmas.append(tok)\n",
    "\n",
    "gun_tokenizer = functools.partial(tokenize, multi_words=[i for i in gun_keywords if ' ' in i])\n",
    "gun_term_finder = functools.partial(term_finder, terms=gun_lemmas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Query the Database\n",
    "\n",
    "Find the initial set of candidate articles."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [
     0,
     9,
     10,
     34
    ],
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "cnx = mysql.connector.connect(\n",
    "    option_files=os.path.expanduser('~/.my.cnf'),\n",
    "    option_groups='clienthbs',\n",
    "    database='cp_data'\n",
    ")\n",
    "\n",
    "cur = cnx.cursor(buffered=True)\n",
    "\n",
    "frames = {}\n",
    "for kw in gun_searches:\n",
    "    cur.execute(\n",
    "        \"\"\"\n",
    "        SELECT b.broadcast_id, p.dt, b.broadcast, b.abstract, b.duration, p.network, b.program_id\n",
    "\n",
    "            FROM news_broadcasts b\n",
    "\n",
    "                INNER JOIN news_programs p\n",
    "                ON p.program_id = b.program_id\n",
    "\n",
    "            WHERE p.network IN ('ABC', 'NBC', 'CBS')\n",
    "              AND p.dt BETWEEN '1989-01-01' AND '2014-12-31'\n",
    "              AND p.program LIKE '%Evening%'\n",
    "              AND (b.broadcast LIKE '%{0}%' OR b.abstract LIKE '%{0}%')\n",
    "              AND b.broadcast NOT LIKE '%Commercial%:%'\n",
    "              AND b.broadcast NOT LIKE '%Upcoming Items%'\n",
    "        \"\"\".format(kw)\n",
    "    )\n",
    "    cols = cur.column_names\n",
    "    frames[kw] = pd.DataFrame.from_records(cur, columns=cols)\n",
    "\n",
    "# gun-related news coverage\n",
    "df = pd.concat(frames)\n",
    "\n",
    "# all evening news broadcast dates (useful to know if show had broadcast)\n",
    "cur.execute(\n",
    "    \"\"\"\n",
    "    SELECT DISTINCT p.dt, p.network, p.program_id\n",
    "\n",
    "        FROM news_programs p\n",
    "\n",
    "    WHERE p.network IN ('ABC', 'NBC', 'CBS')\n",
    "    AND p.dt BETWEEN '1989-01-01' AND '2014-12-31'\n",
    "    AND p.program LIKE '%Evening%'\n",
    "    \"\"\"\n",
    ")\n",
    "cols = cur.column_names\n",
    "prog_dates = pd.DataFrame.from_records(cur, columns=cur.column_names)\n",
    "    \n",
    "cur.close()\n",
    "cnx.close()\n",
    "\n",
    "# clean up broadcast summaries\n",
    "df['abstract'] = df['abstract'].apply(clean_abstract)\n",
    "\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Look at Seconds of Coverage by Term\n",
    "\n",
    "This graph just checks that each term is actually returning possible gun-related stories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "seconds = df.groupby(['dt', df.index.get_level_values(0)]).agg({'duration': np.sum})\n",
    "seconds = seconds.unstack(level=-1, fill_value=0).asfreq('D')\n",
    "seconds.columns = seconds.columns.get_level_values(1)\n",
    "seconds.fillna(value=0, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(nrows=math.ceil(len(gun_searches) / 3), ncols=3, figsize=(26, 10))\n",
    "axs = axs.flatten()\n",
    "\n",
    "for idx, kw in enumerate(gun_searches):\n",
    "    axs[idx].plot(seconds[kw])\n",
    "    axs[idx].set_title(kw, fontdict={'fontsize': 12}, x=0.05, y=0.85, loc='left')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Confirm Gun-Related Stories\n",
    "\n",
    "Remove duplicates, tokenize the titles and abstracts, and confirm that each story is about guns by looking for specific gun-related terms among the tokens."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_uniq = df[\n",
    "    ['dt', 'broadcast_id', 'duration', 'broadcast', 'abstract']\n",
    "].drop_duplicates(\n",
    "    subset=['broadcast_id']\n",
    ").reset_index(drop=True)\n",
    "\n",
    "rec_count = len(df_uniq)\n",
    "print('Found {} unique records'.format(rec_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# convert broadcast title and abstract into a set of tokens\n",
    "df_uniq['tok_broadcast'] = df_uniq['broadcast'].apply(gun_tokenizer)\n",
    "df_uniq['tok_abstract'] = df_uniq['abstract'].apply(gun_tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# confirm news stories are gun-related with better keyword searching\n",
    "# e.g. \"gun\" should not match \"begun\"\n",
    "df_uniq['gun_broadcast'] = df_uniq['tok_broadcast'].apply(gun_term_finder)\n",
    "df_uniq['gun_abstract'] = df_uniq['tok_abstract'].apply(gun_term_finder)\n",
    "\n",
    "print('Titles mentioning guns: ', df_uniq['gun_broadcast'].aggregate(sum), 'of', rec_count)\n",
    "print('Abstracts mentioning guns: ', df_uniq['gun_abstract'].aggregate(sum), 'of', rec_count)\n",
    "df_uniq[df_uniq['gun_broadcast'] == 1][['broadcast_id', 'broadcast', 'abstract']].head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Identifying International News\n",
    "\n",
    "We do not want the data to include stories about wars or foreign events. Therefore, we compile a list of countries and use the appearance of a country name in the broadcast title or abstract as an indication that the story is not about U.S. events.\n",
    "\n",
    "After loading the list of countries we check the database for articles about each and sort the countries from most to least frequent. This makes subsequent searches for the country names in the news stories much faster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [
     0,
     5
    ],
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open('in-data/country_search.tsv', 'r') as fh:\n",
    "    rdr = csv.DictReader(fh, delimiter='\\t')\n",
    "    country_names = [i['name'].strip() for i in rdr]\n",
    "\n",
    "# do not mark these countries because they mostly refer to U.S. locations\n",
    "for loc in ('United States', 'Georgia', 'South Georgia', 'Jersey'):\n",
    "    try:\n",
    "        country_names.remove(loc)\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [
     0,
     1,
     9
    ],
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# country count\n",
    "cnx = mysql.connector.connect(\n",
    "    option_files=os.path.expanduser('~/.my.cnf'),\n",
    "    option_groups='clienthbs',\n",
    "    database='cp_data'\n",
    ")\n",
    "\n",
    "cur = cnx.cursor(buffered=True)\n",
    "country_counts = []\n",
    "for k in country_names:\n",
    "    cur.execute(\n",
    "        \"\"\"\n",
    "        SELECT COUNT(1) FROM news_broadcasts b\n",
    "        WHERE b.broadcast LIKE \"%{}%\"\n",
    "        \"\"\".format(k)\n",
    "    )\n",
    "    country_counts.append((k, cur.fetchall()[0][0]))\n",
    "    \n",
    "cur.close()\n",
    "cnx.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "# sort list of countries with most-common first\n",
    "country_names = [i for i, k in sorted(country_counts, key=itemgetter(1), reverse=True)]\n",
    "print('Most common countries: ', ', '.join(country_names[:5]))\n",
    "\n",
    "# create pattern to find countries surrounded by word boundaries\n",
    "re_country = re.compile(r\"\\b({})\\b\".format('|'.join(country_names)))\n",
    "find_countries = functools.partial(match_regex, regex=re_country)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find broadcast titles and abstracts that mention foreign countries\n",
    "df_uniq['intl_broadcast'] = df_uniq['broadcast'].apply(find_countries)\n",
    "df_uniq['intl_abstract'] = df_uniq['abstract'].apply(find_countries)\n",
    "\n",
    "print('Titles with countries: ', df_uniq['intl_broadcast'].aggregate(sum), 'of', rec_count)\n",
    "print('Abstracts with countries: ', df_uniq['intl_abstract'].aggregate(sum), 'of', rec_count)\n",
    "df_uniq[df_uniq['intl_broadcast'] == 1][['broadcast_id', 'broadcast', 'abstract']].head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save output of all broadcasts with indicators for international news and mentions of gun-related terms."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_uniq.to_csv('out-data/gun_broadcasts.tsv', sep='\\t', encoding='utf-8', index=False)\n",
    "prog_dates.to_csv('out-data/program_dates.tsv', sep='\\t', encoding='utf-8', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Aggregate Gun-Related News Coverage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tot = df_uniq[\n",
    "    # exclude international news\n",
    "    (\n",
    "        (df_uniq['intl_broadcast'] == 0) & (df_uniq['intl_abstract'] == 0)\n",
    "    ) &\n",
    "    # find confirmed gun-related stories\n",
    "    (\n",
    "        (df_uniq['gun_broadcast'] == 1) | (df_uniq['gun_abstract'] == 1)\n",
    "    )\n",
    "].groupby('dt').agg(\n",
    "    {'duration': np.sum}\n",
    ").asfreq('D')\n",
    "\n",
    "all_days = pd.DataFrame({\n",
    "     'date': pd.date_range('1989', '2015', freq='D')\n",
    "})\n",
    "\n",
    "tot = pd.merge(all_days, tot, how='left', left_on='date', right_index=True, validate='one_to_one')\n",
    "\n",
    "tot.fillna(value=0, inplace=True)\n",
    "tot.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "scrolled": true
   },
   "source": [
    "Get days of mass shootings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shooting_days = pd.read_stata('in-data/nonfam_shootings.dta')\n",
    "shooting_days = shooting_days.groupby('date').agg({'fatalities': 'sum', 'victims': 'sum'})\n",
    "\n",
    "shootings = pd.merge(all_days, shooting_days, how='left', left_on=['date'], right_index=True, validate='one_to_one')\n",
    "shootings.fillna(value={'fatalities': 0, 'victims': 0}, inplace=True)\n",
    "\n",
    "shootings['shooting'] = shootings.fatalities > 0\n",
    "shootings.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(18, 6))\n",
    "\n",
    "for d, row in shootings.iterrows():\n",
    "    if row.fatalities >= 10:\n",
    "        ax.axvline(row.date.date(), color=(.8, .4, .4), lw=2)\n",
    "    elif row.fatalities > 0:\n",
    "        ax.axvline(row.date.date(), color=(.9, .9, .9), lw=2)\n",
    "    else:\n",
    "        pass\n",
    "\n",
    "ax.plot(tot['date'], tot['duration'], c='black')\n",
    "\n",
    "ax.set_title('Gun-Related News Coverage', fontdict={'fontsize': 16})\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Seconds')\n",
    "ax.legend=False\n",
    "\n",
    "dates = pd.date_range('1990', '2015', freq=pd.DateOffset(years=2))\n",
    "ax.xaxis.set_ticks(dates)\n",
    "ax.xaxis.set_ticklabels(dates.strftime('%Y'));\n",
    "ax.yaxis.set_ticks([i*500 for i in range(9)]);\n",
    "ax.margins(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = pd.merge(shootings, tot, on=['date'])\n",
    "tmp.reindex\n",
    "tmp['lag_fatal'] = tmp['fatalities'].shift(1)\n",
    "tmp['lag_shooting'] = tmp['shooting'].shift(1)\n",
    "tmp.loc[tmp.duration > 2000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# convert time series of shooting coverage to a Stata dataset\n",
    "tot.to_stata(\n",
    "    'out-data/shooting_news.dta',\n",
    "    convert_dates={'date': 'td'},\n",
    "    write_index=False\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "4px",
    "width": "254px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
